kreuzberg 2.1.1__py3-none-any.whl → 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kreuzberg/__init__.py CHANGED
@@ -1,5 +1,10 @@
1
- from ._tesseract import PSMMode
2
- from ._types import ExtractionResult, Metadata
1
+ from kreuzberg._ocr._easyocr import EasyOCRConfig
2
+ from kreuzberg._ocr._paddleocr import PaddleOCRConfig
3
+ from kreuzberg._ocr._tesseract import TesseractConfig
4
+
5
+ from ._ocr._tesseract import PSMMode
6
+ from ._registry import ExtractorRegistry
7
+ from ._types import ExtractionConfig, ExtractionResult, Metadata
3
8
  from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
4
9
  from .extraction import (
5
10
  batch_extract_bytes,
@@ -7,22 +12,31 @@ from .extraction import (
7
12
  batch_extract_file,
8
13
  batch_extract_file_sync,
9
14
  extract_bytes,
15
+ extract_bytes_sync,
10
16
  extract_file,
17
+ extract_file_sync,
11
18
  )
12
19
 
13
20
  __all__ = [
21
+ "EasyOCRConfig",
22
+ "ExtractionConfig",
14
23
  "ExtractionResult",
24
+ "ExtractorRegistry",
15
25
  "KreuzbergError",
16
26
  "Metadata",
17
27
  "MissingDependencyError",
18
28
  "OCRError",
19
29
  "PSMMode",
30
+ "PaddleOCRConfig",
20
31
  "ParsingError",
32
+ "TesseractConfig",
21
33
  "ValidationError",
22
34
  "batch_extract_bytes",
23
35
  "batch_extract_bytes_sync",
24
36
  "batch_extract_file",
25
37
  "batch_extract_file_sync",
26
38
  "extract_bytes",
39
+ "extract_bytes_sync",
27
40
  "extract_file",
41
+ "extract_file_sync",
28
42
  ]
kreuzberg/_chunker.py ADDED
@@ -0,0 +1,51 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING
4
+
5
+ from kreuzberg import MissingDependencyError
6
+ from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
7
+ from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
8
+
9
+ if TYPE_CHECKING:
10
+ from semantic_text_splitter import MarkdownSplitter, TextSplitter
11
+
12
+ _chunkers: dict[tuple[int, int, str], MarkdownSplitter | TextSplitter] = {}
13
+
14
+
15
def get_chunker(
    mime_type: str,
    max_characters: int = DEFAULT_MAX_CHARACTERS,
    overlap_characters: int = DEFAULT_MAX_OVERLAP,
) -> MarkdownSplitter | TextSplitter:
    """Return a splitter for the given content type, reusing cached instances.

    Splitters are stored in the module-level ``_chunkers`` mapping, keyed by
    ``(max_characters, overlap_characters, mime_type)``, so each distinct
    configuration is only constructed once.

    Args:
        mime_type: The mime type of the content. Markdown content gets a
            ``MarkdownSplitter``; everything else gets a ``TextSplitter``.
        max_characters: Maximum number of characters allowed in each chunk.
        overlap_characters: Number of characters overlapping between two consecutive chunks.

    Raises:
        MissingDependencyError: if semantic-text-splitter is not installed.

    Returns:
        The splitter configured with the requested size and overlap.
    """
    cache_key = (max_characters, overlap_characters, mime_type)
    if cache_key in _chunkers:
        return _chunkers[cache_key]

    try:
        # The imports are deferred so that a missing semantic-text-splitter
        # package only surfaces when chunking is actually requested.
        if mime_type == MARKDOWN_MIME_TYPE:
            from semantic_text_splitter import MarkdownSplitter

            _chunkers[cache_key] = MarkdownSplitter(max_characters, overlap_characters)
        else:
            from semantic_text_splitter import TextSplitter

            _chunkers[cache_key] = TextSplitter(max_characters, overlap_characters)
    except ImportError as e:
        raise MissingDependencyError.create_for_package(
            dependency_group="chunking", functionality="chunking", package_name="semantic-text-splitter"
        ) from e

    return _chunkers[cache_key]
kreuzberg/_constants.py CHANGED
@@ -1,8 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
- from multiprocessing import cpu_count
4
3
  from typing import Final
5
4
 
6
- DEFAULT_MAX_PROCESSES: Final[int] = cpu_count()
7
- MINIMAL_SUPPORTED_TESSERACT_VERSION: Final[int] = 5
8
5
  MINIMAL_SUPPORTED_PANDOC_VERSION: Final[int] = 2
6
+ DEFAULT_MAX_CHARACTERS: Final[int] = 2000
7
+ DEFAULT_MAX_OVERLAP: Final[int] = 100
kreuzberg/_mime_types.py CHANGED
@@ -16,7 +16,7 @@ PDF_MIME_TYPE: Final = "application/pdf"
16
16
  PLAIN_TEXT_MIME_TYPE: Final = "text/plain"
17
17
  POWER_POINT_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
18
18
  DOCX_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
19
- # Excel formats
19
+
20
20
  EXCEL_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
21
21
  EXCEL_BINARY_MIME_TYPE: Final = "application/vnd.ms-excel"
22
22
  EXCEL_MACRO_MIME_TYPE: Final = "application/vnd.ms-excel.sheet.macroEnabled.12"
@@ -24,8 +24,8 @@ EXCEL_BINARY_2007_MIME_TYPE: Final = "application/vnd.ms-excel.sheet.binary.macr
24
24
  EXCEL_ADDON_MIME_TYPE: Final = "application/vnd.ms-excel.addin.macroEnabled.12"
25
25
  EXCEL_TEMPLATE_MIME_TYPE: Final = "application/vnd.ms-excel.template.macroEnabled.12"
26
26
 
27
- # OpenDocument spreadsheet format
28
- OPENDOC_SPREADSHEET_MIME_TYPE: Final = "application/vnd.oasis.opendocument.spreadsheet" # ods
27
+
28
+ OPENDOC_SPREADSHEET_MIME_TYPE: Final = "application/vnd.oasis.opendocument.spreadsheet"
29
29
  PLAIN_TEXT_MIME_TYPES: Final[set[str]] = {PLAIN_TEXT_MIME_TYPE, MARKDOWN_MIME_TYPE}
30
30
 
31
31
  IMAGE_MIME_TYPES: Final[set[str]] = {
@@ -48,26 +48,7 @@ IMAGE_MIME_TYPES: Final[set[str]] = {
48
48
  "image/x-portable-pixmap",
49
49
  "image/x-tiff",
50
50
  }
51
- IMAGE_MIME_TYPE_EXT_MAP: Final[Mapping[str, str]] = {
52
- "image/bmp": "bmp",
53
- "image/x-bmp": "bmp",
54
- "image/x-ms-bmp": "bmp",
55
- "image/gif": "gif",
56
- "image/jpeg": "jpg",
57
- "image/pjpeg": "jpg",
58
- "image/png": "png",
59
- "image/tiff": "tiff",
60
- "image/x-tiff": "tiff",
61
- "image/jp2": "jp2",
62
- "image/jpx": "jpx",
63
- "image/jpm": "jpm",
64
- "image/mj2": "mj2",
65
- "image/webp": "webp",
66
- "image/x-portable-anymap": "pnm",
67
- "image/x-portable-bitmap": "pbm",
68
- "image/x-portable-graymap": "pgm",
69
- "image/x-portable-pixmap": "ppm",
70
- }
51
+
71
52
  PANDOC_SUPPORTED_MIME_TYPES: Final[set[str]] = {
72
53
  "application/csl+json",
73
54
  "application/docbook+xml",
@@ -162,13 +143,17 @@ SUPPORTED_MIME_TYPES: Final[set[str]] = (
162
143
  )
163
144
 
164
145
 
165
- def validate_mime_type(file_path: PathLike[str] | str, mime_type: str | None = None) -> str:
146
+ def validate_mime_type(
147
+ *, file_path: PathLike[str] | str | None = None, mime_type: str | None = None, check_file_exists: bool = True
148
+ ) -> str:
166
149
  """Validate and detect the MIME type for a given file.
167
150
 
168
151
  Args:
169
152
  file_path: The path to the file.
170
153
  mime_type: Optional explicit MIME type. If provided, this will be validated.
171
154
  If not provided, the function will attempt to detect the MIME type.
155
+ check_file_exists: Whether to check if the file exists. Default is True.
156
+ Set to False in tests where you want to validate a mime type without an actual file.
172
157
 
173
158
  Raises:
174
159
  ValidationError: If the MIME type is not supported or cannot be determined.
@@ -176,10 +161,18 @@ def validate_mime_type(file_path: PathLike[str] | str, mime_type: str | None = N
176
161
  Returns:
177
162
  The validated MIME type.
178
163
  """
179
- path = Path(file_path)
164
+ if file_path and check_file_exists:
165
+ path = Path(file_path)
166
+ if not path.exists():
167
+ raise ValidationError("The file does not exist", context={"file_path": str(path)})
180
168
 
181
169
  if not mime_type:
182
- # Try to determine MIME type from file extension first
170
+ if not file_path:
171
+ raise ValidationError(
172
+ "Could not determine mime type.",
173
+ )
174
+ path = Path(file_path)
175
+
183
176
  ext = path.suffix.lower()
184
177
  mime_type = EXT_TO_MIME_TYPE.get(ext) or guess_type(path.name)[0]
185
178
 
kreuzberg/_playa.py ADDED
@@ -0,0 +1,276 @@
1
+ from __future__ import annotations
2
+
3
+ from datetime import datetime
4
+ from typing import TYPE_CHECKING, Any, cast
5
+
6
+ from playa import asobj, parse
7
+ from playa.utils import decode_text
8
+
9
+ from kreuzberg.exceptions import ParsingError
10
+
11
+ if TYPE_CHECKING:
12
+ from playa.document import Document
13
+
14
+ from kreuzberg._types import Metadata
15
+
16
+
17
+ GRAY_COMPONENTS = 1
18
+ RGB_COMPONENTS = 3
19
+ CMYK_COMPONENTS = 4
20
+ UTF16BE_BOM = b"\xfe\xff"
21
+ UTF16BE_ENCODING = "utf-16be"
22
+ MIN_DATE_LENGTH = 8
23
+ FULL_DATE_LENGTH = 14
24
+ BOM_CHAR = "\ufeff"
25
+
26
+
27
async def extract_pdf_metadata(pdf_content: bytes) -> Metadata:
    """Build a metadata mapping for a PDF document.

    Every info dictionary of the document is run through the per-topic
    extraction helpers, after which page dimensions, an outline-based
    description, a generated summary and structure information are added.

    Args:
        pdf_content: The bytes of the PDF document.

    Raises:
        ParsingError: If the PDF metadata could not be extracted.

    Returns:
        A dictionary of metadata extracted from the PDF.
    """
    info_extractors = (
        _extract_basic_metadata,
        _extract_author_metadata,
        _extract_keyword_metadata,
        _extract_category_metadata,
        _extract_date_metadata,
        _extract_creator_metadata,
    )

    try:
        document = parse(pdf_content, max_workers=1)
        metadata: Metadata = {}

        for raw_info in document.info:
            # Keys are lower-cased so the helpers can look fields up case-insensitively.
            pdf_info = {key.lower(): value for key, value in asobj(raw_info).items()}
            for extract in info_extractors:
                extract(pdf_info, metadata)

        if document.pages:
            _extract_document_dimensions(document, metadata)

        # An explicit description from the info dictionaries wins over the outline.
        if document.outline and "description" not in metadata:
            metadata["description"] = _generate_outline_description(document)

        if "summary" not in metadata:
            metadata["summary"] = _generate_document_summary(document)

        _extract_structure_information(document, metadata)
    except Exception as e:
        # Any failure while reading the document is reported as a parsing problem.
        raise ParsingError(f"Failed to extract PDF metadata: {e!s}") from e
    else:
        return metadata
66
+
67
+
68
def _extract_basic_metadata(pdf_info: dict[str, Any], result: Metadata) -> None:
    """Copy simple text fields from a PDF info dictionary into ``result``.

    A field is written only when it is not already present in ``result`` and
    the info dictionary holds a truthy value under one of its accepted keys.

    Args:
        pdf_info: The PDF info dictionary to read from.
        result: The metadata mapping that is updated in place.
    """

    def first_truthy(*keys: str) -> Any:
        # Mirrors ``pdf_info.get(a) or pdf_info.get(b)``: the first truthy value wins.
        for key in keys:
            value = pdf_info.get(key)
            if value:
                return value
        return None

    # The exact key "Publisher" takes precedence whenever it exists, even if its value is empty.
    publisher = pdf_info["Publisher"] if "Publisher" in pdf_info else pdf_info.get("publisher")

    candidates: tuple[tuple[str, Any], ...] = (
        ("title", first_truthy("title")),
        ("subject", first_truthy("subject")),
        ("publisher", publisher),
        ("copyright", first_truthy("copyright", "rights")),
        ("comments", first_truthy("comments")),
        ("identifier", first_truthy("identifier", "id")),
        ("license", first_truthy("license")),
        ("modified_by", first_truthy("modifiedby", "last_modified_by")),
        ("version", first_truthy("version")),
    )

    for field, raw_value in candidates:
        if field not in result and raw_value:
            result[field] = decode_text(raw_value)  # type: ignore[literal-required]
95
+
96
+
97
def _extract_author_metadata(pdf_info: dict[str, Any], result: Metadata) -> None:
    """Populate ``result["authors"]`` from the ``author`` entry of a PDF info dictionary.

    A string (or bytes) value is decoded, " and " is treated like a comma, and
    the text is split on ";" and "," into individual, stripped, non-empty names.
    A list value is decoded element by element.

    Args:
        pdf_info: The PDF info dictionary to read from.
        result: The metadata mapping that is updated in place.
    """
    author = pdf_info.get("author")
    if not author:
        return

    if isinstance(author, list):
        result["authors"] = [decode_text(entry) for entry in author]
        return

    if isinstance(author, (str, bytes)):
        normalized = decode_text(author).replace(" and ", ", ")
        result["authors"] = [
            name.strip() for segment in normalized.split(";") for name in segment.split(",") if name.strip()
        ]
111
+
112
+
113
def _extract_keyword_metadata(pdf_info: dict[str, Any], result: Metadata) -> None:
    """Populate ``result["keywords"]`` from the ``keywords`` entry of a PDF info dictionary.

    A string (or bytes) value is decoded and split on both "," and ";"; the
    pieces are stripped and empty ones are dropped. A list value is decoded
    element by element.

    Args:
        pdf_info: The PDF info dictionary to read from.
        result: The metadata mapping that is updated in place.
    """
    keywords = pdf_info.get("keywords")
    if not keywords:
        return

    if isinstance(keywords, (str, bytes)):
        # Both separators are honoured. Previously the comma-split pieces were
        # re-joined with spaces before splitting on ";", which merged
        # comma-separated keywords into a single entry ("a, b" -> ["a b"]).
        pieces = decode_text(keywords).replace(";", ",").split(",")
        result["keywords"] = [piece.strip() for piece in pieces if piece.strip()]
    elif isinstance(keywords, list):
        result["keywords"] = [decode_text(keyword) for keyword in keywords]
122
+
123
+
124
def _extract_category_metadata(pdf_info: dict[str, Any], result: Metadata) -> None:
    """Populate ``result["categories"]`` from a PDF info dictionary.

    The value is read from ``categories`` or, failing that, ``category``. A
    string (or bytes) value is decoded and split on ","; a list value is
    decoded element by element.

    Args:
        pdf_info: The PDF info dictionary to read from.
        result: The metadata mapping that is updated in place.
    """
    categories = pdf_info.get("categories") or pdf_info.get("category")
    if not categories:
        return

    if isinstance(categories, list):
        result["categories"] = [decode_text(entry) for entry in categories]
    elif isinstance(categories, (str, bytes)):
        stripped = (part.strip() for part in decode_text(categories).split(","))
        result["categories"] = [part for part in stripped if part]
132
+
133
+
134
def _parse_date_string(date_str: str) -> str:
    """Convert a PDF date string into an ISO 8601 timestamp.

    An optional leading "D:" is removed. If at least ``FULL_DATE_LENGTH``
    characters remain, the first 14 are parsed as ``YYYYMMDDHHMMSS``; if at
    least ``MIN_DATE_LENGTH`` remain, the first 8 are parsed as ``YYYYMMDD``
    (midnight). Anything after the parsed portion, such as a timezone suffix,
    is ignored. Shorter strings are returned unchanged (minus the prefix).

    Args:
        date_str: The decoded date string from the PDF info dictionary.

    Raises:
        ValueError: If the relevant portion is not a valid date/time.

    Returns:
        The ISO formatted timestamp, or the input without the "D:" prefix when
        it is too short to contain a date.
    """
    date_str = date_str.removeprefix("D:")
    if len(date_str) < MIN_DATE_LENGTH:
        return date_str

    # The slice and the format must agree: the previous implementation built a
    # dash/colon separated string but parsed it with a separator-free format,
    # so strptime always raised and no date was ever normalised.
    if len(date_str) >= FULL_DATE_LENGTH:
        return datetime.strptime(date_str[:FULL_DATE_LENGTH], "%Y%m%d%H%M%S").isoformat()  # noqa: DTZ007
    return datetime.strptime(date_str[:MIN_DATE_LENGTH], "%Y%m%d").isoformat()  # noqa: DTZ007
148
+
149
+
150
def _extract_date_metadata(pdf_info: dict[str, Any], result: Metadata) -> None:
    """Populate ``created_at`` and ``modified_at`` from a PDF info dictionary.

    The creation date is read from ``creationdate`` or ``createdate`` and the
    modification date from ``moddate`` or ``modificationdate``. Each value is
    decoded and passed to ``_parse_date_string``; if that raises ``ValueError``
    or ``IndexError`` the decoded raw text is stored instead.

    Args:
        pdf_info: The PDF info dictionary to read from.
        result: The metadata mapping that is updated in place.
    """

    def to_timestamp(raw_value: Any) -> str:
        try:
            return _parse_date_string(decode_text(raw_value))
        except (ValueError, IndexError):
            # Keep the unparsed text rather than dropping the field.
            return decode_text(raw_value)

    created = pdf_info.get("creationdate") or pdf_info.get("createdate")
    if created:
        result["created_at"] = to_timestamp(created)

    modified = pdf_info.get("moddate") or pdf_info.get("modificationdate")
    if modified:
        result["modified_at"] = to_timestamp(modified)
164
+
165
+
166
def _extract_creator_metadata(pdf_info: dict[str, Any], result: Metadata) -> None:
    """Populate ``result["created_by"]`` from the ``creator`` and ``producer`` entries.

    The creator, when present, is stored first. The producer is used on its own
    when no ``created_by`` value exists, and is appended in parentheses when it
    is not already contained in the existing value.

    Args:
        pdf_info: The PDF info dictionary to read from.
        result: The metadata mapping that is updated in place.
    """
    creator = pdf_info.get("creator")
    if creator:
        result["created_by"] = decode_text(creator)

    producer = pdf_info.get("producer")
    if not producer:
        return

    producer_str = decode_text(producer)
    if "created_by" not in result:
        result["created_by"] = producer_str
        return

    current = result["created_by"]
    if producer_str not in current:
        result["created_by"] = f"{current} (Producer: {producer_str})"
176
+
177
+
178
def _extract_document_dimensions(document: Document, result: Metadata) -> None:
    """Store the first page's width and height, truncated to integers, in ``result``.

    Nothing is written unless the first page exposes both ``width`` and ``height``.

    Args:
        document: The parsed PDF document; it must contain at least one page.
        result: The metadata mapping that is updated in place.
    """
    page = document.pages[0]
    if not (hasattr(page, "width") and hasattr(page, "height")):
        return

    result["width"] = int(page.width)
    result["height"] = int(page.height)
183
+
184
+
185
def _format_outline(entries: list[Any], level: int = 0) -> list[str]:
    """Render outline entries as an indented bullet list, depth first.

    Each entry with a truthy ``title`` becomes one ``"- <title>"`` line,
    indented by one space per nesting level. Children are rendered directly
    after their parent, one level deeper; children of an entry without a title
    are still rendered.

    Args:
        entries: The outline entries of the current level.
        level: The nesting depth of ``entries``; the top level is 0.

    Returns:
        The formatted lines for ``entries`` and all of their descendants.
    """
    outline_text: list[str] = []
    indent = " " * level
    for entry in entries:
        title = getattr(entry, "title", None)
        if title:
            outline_text.append(f"{indent}- {title}")

        children = getattr(entry, "children", None)
        if children:
            # The nested lines must be merged into the result; the recursive
            # call's return value used to be discarded, which dropped every
            # entry below the top level.
            outline_text.extend(_format_outline(children, level + 1))

    return outline_text
195
+
196
+
197
def _generate_outline_description(document: Document) -> str:
    """Return the document outline as a "Table of Contents" text block.

    Args:
        document: The parsed PDF document whose outline is formatted.

    Returns:
        The heading followed by one line per formatted outline entry, or an
        empty string when formatting the outline yields no lines.
    """
    lines = _format_outline(cast("list[Any]", document.outline))
    if not lines:
        return ""
    return "Table of Contents:\n" + "\n".join(lines)
201
+
202
+
203
def _generate_document_summary(document: Document) -> str:
    """Compose a one-line, human readable summary of a PDF document.

    The summary always states the page count. The PDF version, encryption,
    permissions, status and PDF/A compliance are added when the document
    exposes the corresponding attributes.

    Args:
        document: The parsed PDF document to describe.

    Returns:
        The summary sentences joined by single spaces.
    """
    page_count = len(document.pages)
    plural_suffix = "" if page_count == 1 else "s"
    parts = [f"PDF document with {page_count} page{plural_suffix}."]

    # The version is reported whenever the attribute exists, whatever its value.
    if hasattr(document, "pdf_version"):
        parts.append(f"PDF version {document.pdf_version}.")

    if getattr(document, "is_encrypted", False):
        parts.append("Document is encrypted.")

    encryption_method = getattr(document, "encryption_method", None)
    if encryption_method:
        parts.append(f"Encryption: {encryption_method}.")

    permissions = _collect_document_permissions(document)
    if permissions:
        parts.append(f"Document is {', '.join(permissions)}.")

    status = getattr(document, "status", None)
    if status:
        parts.append(f"Status: {decode_text(status)}.")

    if getattr(document, "is_pdf_a", False):
        pdf_a_level = getattr(document, "pdf_a_level", None)
        parts.append(f"PDF/A-{pdf_a_level} compliant." if pdf_a_level else "PDF/A compliant.")

    return " ".join(parts)
233
+
234
+
235
def _collect_document_permissions(document: Document) -> list[str]:
    """List the permissions that are enabled on a PDF document.

    Args:
        document: The parsed PDF document to inspect.

    Returns:
        The labels "printable", "modifiable" and "extractable", in that order,
        for each corresponding flag on the document that is truthy.
    """
    flags = (
        (document.is_printable, "printable"),
        (document.is_modifiable, "modifiable"),
        (document.is_extractable, "extractable"),
    )
    return [label for enabled, label in flags if enabled]
244
+
245
+
246
def _extract_structure_information(document: Document, result: Metadata) -> None:
    """Derive languages and a subtitle from the document's structure tree.

    The tree is walked depth first. Every truthy ``language`` found is collected
    in lower case, and the text of the first "H1" element with text becomes the
    subtitle candidate. The subtitle is stored only when ``result`` already has
    a title that differs from it.

    Args:
        document: The parsed PDF document whose structure is inspected.
        result: The metadata mapping that is updated in place.
    """
    if not document.structure:
        return

    found_languages: set[str] = set()
    first_heading = None

    def walk(nodes: list[Any]) -> None:
        nonlocal first_heading
        for node in nodes:
            language = getattr(node, "language", None)
            if language:
                found_languages.add(language.lower())

            # Only the first heading encountered is considered.
            if first_heading is None and getattr(node, "role", None) == "H1":
                text = getattr(node, "text", None)
                if text:
                    first_heading = decode_text(text)

            children = getattr(node, "children", None)
            if children:
                walk(children)

    walk(cast("list[Any]", document.structure))

    if found_languages:
        result["languages"] = list(found_languages)

    if first_heading and "title" in result and first_heading != result["title"]:
        result["subtitle"] = first_heading
kreuzberg/_registry.py ADDED
@@ -0,0 +1,108 @@
1
+ from __future__ import annotations
2
+
3
+ from functools import lru_cache
4
+ from typing import TYPE_CHECKING, ClassVar
5
+
6
+ from kreuzberg._extractors._html import HTMLExtractor
7
+ from kreuzberg._extractors._image import ImageExtractor
8
+ from kreuzberg._extractors._pandoc import (
9
+ BibliographyExtractor,
10
+ EbookExtractor,
11
+ LaTeXExtractor,
12
+ MarkdownExtractor,
13
+ MiscFormatExtractor,
14
+ OfficeDocumentExtractor,
15
+ StructuredTextExtractor,
16
+ TabularDataExtractor,
17
+ XMLBasedExtractor,
18
+ )
19
+ from kreuzberg._extractors._pdf import PDFExtractor
20
+ from kreuzberg._extractors._presentation import PresentationExtractor
21
+ from kreuzberg._extractors._spread_sheet import SpreadSheetExtractor
22
+
23
+ if TYPE_CHECKING:
24
+ from kreuzberg._extractors._base import Extractor
25
+ from kreuzberg._types import ExtractionConfig
26
+
27
+
28
class ExtractorRegistry:
    """Registry that resolves a MIME type to an extractor instance.

    Two class-level lists are consulted: extractors registered through
    ``add_extractor`` and a built-in default list. Registered extractors are
    always tried before the defaults, so they can override built-in handling
    of a MIME type.
    """

    # Built-in extractors, tried in this order after any registered ones.
    _default_extractors: ClassVar[list[type[Extractor]]] = [
        PDFExtractor,
        OfficeDocumentExtractor,
        PresentationExtractor,
        SpreadSheetExtractor,
        HTMLExtractor,
        MarkdownExtractor,
        ImageExtractor,
        BibliographyExtractor,
        EbookExtractor,
        LaTeXExtractor,
        MiscFormatExtractor,
        StructuredTextExtractor,
        TabularDataExtractor,
        XMLBasedExtractor,
    ]
    # Extractors added at runtime via ``add_extractor``.
    _registered_extractors: ClassVar[list[type[Extractor]]] = []

    @classmethod
    @lru_cache
    def get_extractor(cls, mime_type: str | None, config: ExtractionConfig) -> Extractor | None:
        """Return an extractor instance able to handle ``mime_type``.

        Results are memoised with ``lru_cache``, so both arguments must be
        hashable and repeated calls with equal arguments return the cached
        instance until the cache is cleared.

        Args:
            mime_type: The mime type of the content.
            config: The extraction configuration passed to the extractor.

        Returns:
            An instance of the first extractor class that supports the mime
            type, or None when the mime type is empty or unsupported.
        """
        if not mime_type:
            return None

        for extractor_cls in (*cls._registered_extractors, *cls._default_extractors):
            if extractor_cls.supports_mimetype(mime_type):
                return extractor_cls(mime_type=mime_type, config=config)

        return None

    @classmethod
    def add_extractor(cls, extractor: type[Extractor]) -> None:
        """Register an additional extractor class.

        Note:
            Extractors are tried in the order they are added: first added, first tried.

        Args:
            extractor: The extractor to add.

        Returns:
            None
        """
        cls._registered_extractors.append(extractor)
        # Cached lookups may now resolve differently, so drop them.
        cls.get_extractor.cache_clear()

    @classmethod
    def remove_extractor(cls, extractor: type[Extractor]) -> None:
        """Unregister an extractor class; unknown extractors are ignored.

        Args:
            extractor: The extractor to remove.

        Returns:
            None
        """
        if extractor not in cls._registered_extractors:
            return

        cls._registered_extractors.remove(extractor)
        # Cached lookups may still point at the removed extractor.
        cls.get_extractor.cache_clear()